Scott Cole
21 May 2016
This notebook attempts to predict the overall rating of a burrito as a linear combination of its dimensions. Interpretation of these models is complicated by the significant correlations between dimensions (such as meat quality and non-meat filling quality).
In [1]:
%config InlineBackend.figure_format = 'retina'
%matplotlib inline
import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
import pandas as pd
import statsmodels.api as sm
import seaborn as sns
sns.set_style("white")
In [2]:
# Load the burrito dataset via the project-local util module
import util
df = util.load_burritos()
N = df.shape[0]  # number of burrito entries (rows)
In [3]:
# Define predictors of the model: the individual burrito dimensions
m_lm = ['Hunger', 'Cost', 'Tortilla', 'Temp', 'Meat', 'Fillings', 'Meat:filling',
        'Uniformity', 'Salsa', 'Wrap']

# Remove incomplete data: keep only rows with all predictors and the target
dffull = df[np.hstack((m_lm, 'overall'))].dropna()
X = sm.add_constant(dffull[m_lm])
y = dffull['overall']

# Gaussian GLM (default family) predicting overall rating from the dimensions
my_glm = sm.GLM(y, X)
res = my_glm.fit()
print(res.summary())
# Fraction of variance explained (R^2-like statistic from Pearson residuals)
# Fixed: was a Python 2 `print` statement, inconsistent with print() above
print(1 - np.var(res.resid_pearson) / np.var(y))
In [4]:
# Visualize model coefficients, sorted from largest to smallest
from tools.plt import bar  # project-local bar-plot helper

# Order coefficients descending, then drop the intercept (position 0 in params)
newidx = np.argsort(-res.params.values)
newidx = newidx[newidx != 0]  # simpler than np.delete with a temp index array
bar(res.params[newidx], res.bse[newidx], X.keys()[newidx],
    'Overall rating\nLinear model\ncoefficient',
    ylim=(0, .5), figsize=(11, 3))
# Removed no-op plt.plot() call (it drew nothing)
figname = 'overall_metric_linearmodelcoef'
# TODO(review): hardcoded absolute Windows path — make this configurable
plt.savefig('C:/gh/fig/burrito/' + figname + '.png')
In [5]:
# Get all ingredient keys
startingredients = 29  # column index where ingredient columns begin in df
ingredientkeys = df.keys()[startingredients:]

# Keep only ingredients recorded for at least Nlim burritos
Nlim = 10
ingredientkeys = ingredientkeys[df.count()[startingredients:].values >= Nlim]

# Make a dataframe for all ingredients.
# Explicit .copy() so the assignments below modify an independent frame
# rather than a view of df (avoids SettingWithCopyWarning / silent no-op).
dfing = df[ingredientkeys].copy()

# Convert data to binary: any 'x'/'X'/1 marker -> 1, missing -> 0
for k in dfing.keys():
    dfing[k] = dfing[k].map({'x': 1, 'X': 1, 1: 1}).fillna(0)
In [6]:
# Run a general linear model predicting overall burrito rating
# from the binary ingredient matrix
y = df.overall
X = sm.add_constant(dfing)
lm = sm.GLM(y, X)
res = lm.fit()
print(res.summary())
# Variance explained; saved for comparison against surrogate models below
origR2 = 1 - np.var(res.resid_pearson) / np.var(y)
In [7]:
# Test if the variance explained by the ingredient model is significantly
# better than chance, using random surrogate design matrices
np.random.seed(0)  # reproducible surrogate draws
Nsurr = 1000
randr2 = np.zeros(Nsurr)
for n in range(Nsurr):
    # Random predictors with the same shape as X; keep the constant column
    Xrand = np.random.rand(X.shape[0], X.shape[1])
    Xrand[:, 0] = np.ones(X.shape[0])
    lm = sm.GLM(y, Xrand)
    res = lm.fit()
    randr2[n] = 1 - np.var(res.resid_pearson) / np.var(y)
# Empirical p-value: fraction of surrogates explaining at least as much variance
# Fixed: was a Python 2 `print` statement
print('p = ', np.mean(randr2 > origR2))
In [8]:
# Average each metric over each Location
# Avoid case issues; in the future should also handle article issues ("the")
df.Location = df.Location.str.lower()
m_Location = ['Location', 'N', 'Yelp', 'Google', 'Hunger', 'Cost', 'Volume',
              'Tortilla', 'Temp', 'Meat', 'Fillings', 'Meat:filling',
              'Uniformity', 'Salsa', 'Synergy', 'Wrap', 'overall']
tacoshops = df.Location.unique()
TS = len(tacoshops)
dfmean = pd.DataFrame(np.nan, index=range(TS), columns=m_Location)
for ts in range(TS):
    dfmean.loc[ts] = df.loc[df.Location == tacoshops[ts]].mean()
    # .loc with (row, col) avoids chained assignment (dfmean['N'][ts]),
    # which raises SettingWithCopyWarning and may not write through
    dfmean.loc[ts, 'N'] = sum(df.Location == tacoshops[ts])
dfmean.Location = tacoshops
In [16]:
# Note high correlations between features
# Metrics (per-location averages) whose pairwise correlations are visualized
m_Yelp = ['Google','Yelp','Hunger','Cost','Volume','Tortilla','Temp','Meat','Fillings','Meat:filling',
'Uniformity','Salsa','Synergy','Wrap','overall']
M = len(m_Yelp)
# Pairwise Pearson correlation matrix of the per-location averages
dfmeancorr = dfmean[m_Yelp].corr()
from matplotlib import cm
clim1 = (-1,1)  # color limits: full correlation range
plt.figure(figsize=(10,10))
# pcolor takes M+1 cell edges in each direction for an MxM matrix
cax = plt.pcolor(range(M+1), range(M+1), dfmeancorr, cmap=cm.bwr)
cbar = plt.colorbar(cax, ticks=(-1,-.5,0,.5,1))
cbar.ax.set_ylabel('Pearson correlation (r)', size=30)
plt.clim(clim1)
cbar.ax.set_yticklabels((-1,-.5,0,.5,1),size=20)
ax = plt.gca()
# Offset ticks by .5 so labels sit at cell centers rather than edges
ax.set_yticks(np.arange(M)+.5)
ax.set_yticklabels(m_Yelp,size=25)
ax.set_xticks(np.arange(M)+.5)
ax.set_xticklabels(m_Yelp,size=25)
plt.xticks(rotation='vertical')
plt.xlim((0,M))
plt.ylim((0,M))
plt.tight_layout()
In [10]:
# GLM predicting a location's Yelp rating from all burrito dimensions
m_Yelp = ['Hunger', 'Cost', 'Tortilla', 'Temp', 'Meat', 'Fillings', 'Meat:filling',
          'Uniformity', 'Salsa', 'Synergy', 'Wrap', 'overall']

# Drop locations missing any predictor or the Yelp rating
dffull = dfmean[np.hstack((m_Yelp, 'Yelp'))].dropna()
X = sm.add_constant(dffull[m_Yelp])
y = dffull['Yelp']

my_glm = sm.GLM(y, X)
res = my_glm.fit()
print(res.summary())
print(res.pvalues)
# Fraction of variance explained
# Fixed: was a Python 2 `print` statement, inconsistent with print() above
print(1 - np.var(res.resid_pearson) / np.var(y))
In [11]:
# GLM for Yelp: reduced model using only tortilla quality and overall rating
m_Yelp = ['Tortilla', 'overall']

# Keep only locations with complete data for these columns
dffull = dfmean[m_Yelp + ['Yelp']].dropna()
X = sm.add_constant(dffull[m_Yelp])
y = dffull['Yelp']

my_glm = sm.GLM(y, X)
res = my_glm.fit()
print(res.summary())
In [12]:
# Scatter plot: average tortilla rating vs. Yelp rating, one point per location
plt.figure(figsize=(4, 4))
ax = plt.gca()
dfmean.plot(kind='scatter', x='Tortilla', y='Yelp', ax=ax,
            **{'s': 40, 'color': 'k', 'alpha': .3})
plt.xlabel('Average Tortilla rating', size=20)
plt.ylabel('Yelp rating', size=20)
plt.xticks(np.arange(0, 6), size=15)
plt.yticks(np.arange(0, 6), size=15)
plt.ylim((2, 5))
plt.tight_layout()
# Rank correlation (robust to monotonic but nonlinear relationships)
# Fixed: was a Python 2 `print` statement
print(sp.stats.spearmanr(dffull.Yelp, dffull.Tortilla))
figname = 'corr-Yelp-tortilla'
# TODO(review): hardcoded absolute Windows path — make this configurable
plt.savefig('C:/gh/fig/burrito/' + figname + '.png')
In [ ]: